{
 "cells": [
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:22:48.187973Z",
     "start_time": "2024-12-25T07:22:40.700696Z"
    }
   },
   "source": [
    "# Notebook-wide imports for the exploration and baseline cells below.\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import RidgeClassifier, LogisticRegression\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Explicit imports replace `%pylab inline`, which is discouraged because it\n",
    "# star-imports numpy and matplotlib into the global namespace. Later cells\n",
    "# only rely on the `np` name, which is now imported explicitly above.\n",
    "%matplotlib inline\n",
    "\n",
    "# Both files are tab-separated; nrows=None reads every row.\n",
    "train_df = pd.read_csv('../input/train_set.csv', sep='\\t', nrows=None)\n",
    "test_df = pd.read_csv('../input/test_a.csv', sep='\\t', nrows=None)"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "execution_count": 5
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:22:51.478035Z",
     "start_time": "2024-12-25T07:22:51.265844Z"
    }
   },
   "source": [
    "# Bar chart of the number of train_df rows per label value.\n",
    "train_df['label'].value_counts().plot.bar()"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ],
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD7CAYAAACIYvgKAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYNElEQVR4nO3df5TV9X3n8edLUKI1CuqUpYCFjbRZzJ4QnUV27Z5abXDQnEJaN4vpCdQloXvEk+ScnK6Y9hxTo7vYcxq35qh7aCBCNgkhJqmzBoNETdK0qzJGAiJaJmgCFHEqqLVmdSGv/eP7md27452ZO3PvzPDj9Tjnnvne9/fzfd/PHYZ53e+Pe0e2iYiIk9spYz2BiIgYewmDiIhIGERERMIgIiJIGEREBAmDiIgAxo/1BIbrvPPO84wZM8Z6GhERx5Unn3zyH2y39a03HAaSxgFdwH7bH5A0E9gAnAs8CXzE9luSJgDrgYuBl4F/b/uF0uMmYBlwFPi47c2l3gH8BTAO+ILtVYPNZ8aMGXR1dTU6/YiIACT9tF59KIeJPgHsqrl/O3CH7QuAw1S/5ClfD5f6HWUckmYDi4ELgQ7gbknjSsjcBSwAZgPXlrERETFKGgoDSdOAq4EvlPsCLgfuK0PWAYvK8sJyn7L+ijJ+IbDB9pu2nwe6gbnl1m17j+23qPY2Fjb5vCIiYgga3TP4r8B/An5R7p8LvGL7SLm/D5halqcCewHK+lfL+P9b77NNf/W3kbRcUpekrp6enganHhERgxk0DCR9AHjJ9pOjMJ8B2V5tu912e1vb285/RETEMDVyAvlS4HckXQW8AziL6mTvREnjy6v/acD+Mn4/MB3YJ2k8cDbVieTeeq/abfqrR0TEKBh0z8D2Tban2Z5BdQL4Edu/DzwKXFOGLQXuL8ud5T5l/SOuPhq1E1gsaUK5EmkW8ASwFZglaaak08pjdLbk2UVEREOaeZ/BjcAGSbcCTwFrSn0N8CVJ3cAhql/u2N4paSPwDHAEWGH7KICkG4DNVJeWrrW9s4l5RUTEEOl4/XsG7e3tzvsMIiKGRtKTttv71o/bdyD3Z8bKbw9p/Aurrh6hmUREHD/y2UQREZEwiIiIhEFERJAwiIgIEgYREUHCICIiSBhERAQJg4iIIGEQERGcgO9AHml5h3NEnIiyZxAREQmDiIhIGEREBAmDiIggYRARESQMIiKCBsJA0jskPSHpx5J2SvrTUr9X0vOStpXbnFKXpDsldUvaLumiml5LJe0ut6U19Ysl7Sjb3ClJI/BcIyKiH428z+BN4HLbr0s6FfihpAfLuj+yfV+f8Quo/tj9LOAS4B7gEknnADcD7YCBJyV12j5cxnwMeBzYBHQADxIREaNi0D0DV14vd08tt4H+cPJCYH3Z7jFgoqQpwJXAFtuHSgBsATrKurNsP+bqDzKvBxYN/ylFRMRQNXTOQNI4SduAl6h+oT9eVt1WDgXdIWlCqU0F9tZsvq/UBqrvq1OPiIhR0lAY2D5qew4wDZgr6T3ATcC7gX8FnAPcOFKT7CVpuaQuSV09PT0j/XARESeNIV1NZPsV4FGgw/aBcijoTeCLwNwybD8wvWazaaU2UH1anXq9x19tu912e1tb21CmHhERA2jkaqI2SRPL8unA+4Fny7F+ypU/i4CnyyadwJJyVdE84FXbB4DNwHxJkyRNAuYDm8u61yTNK72WAPe38klGRMTAGrmaaAqwTtI4qvDYaPsBSY9IagMEbAP+Yxm/CbgK6AbeAK4DsH1I0meBrWXcLbYPleXrgXuB06muIsqVRBERo2jQMLC9HXhfnfrl/Yw3sKKfdWuBtXXqXcB7BptLRESMjLwDOSIiEgYREZEwiIgIEgYREUHCICIiSBhERAQJg4iIIGEQEREkDCIigoRBRESQMIiICBIGERFBwiAiIkgYREQEjf09gxhFM1Z+u+GxL6y6egRnEhEnk+wZREREwiAi
IhIGERFBA2Eg6R2SnpD0Y0k7Jf1pqc+U9Likbklfk3RaqU8o97vL+hk1vW4q9eckXVlT7yi1bkkrR+B5RkTEABrZM3gTuNz2e4E5QIekecDtwB22LwAOA8vK+GXA4VK/o4xD0mxgMXAh0AHcLWmcpHHAXcACYDZwbRkbERGjZNAwcOX1cvfUcjNwOXBfqa8DFpXlheU+Zf0VklTqG2y/aft5oBuYW27dtvfYfgvYUMZGRMQoaeicQXkFvw14CdgC/AR4xfaRMmQfMLUsTwX2ApT1rwLn1tb7bNNfvd48lkvqktTV09PTyNQjIqIBDYWB7aO25wDTqF7Jv3skJzXAPFbbbrfd3tbWNhZTiIg4IQ3paiLbrwCPAv8amCip901r04D9ZXk/MB2grD8beLm23meb/uoRETFKGrmaqE3SxLJ8OvB+YBdVKFxThi0F7i/LneU+Zf0jtl3qi8vVRjOBWcATwFZgVrk66TSqk8ydLXhuERHRoEY+jmIKsK5c9XMKsNH2A5KeATZIuhV4ClhTxq8BviSpGzhE9csd2zslbQSeAY4AK2wfBZB0A7AZGAestb2zZc8wIiIGNWgY2N4OvK9OfQ/V+YO+9f8F/Lt+et0G3FanvgnY1MB8IyJiBOQdyBERkTCIiIiEQUREkDCIiAgSBhERQcIgIiJIGEREBAmDiIggYRARESQMIiKChEFERJAwiIgIEgYREUHCICIiSBhERAQJg4iIIGEQEREkDCIiggbCQNJ0SY9KekbSTkmfKPXPSNovaVu5XVWzzU2SuiU9J+nKmnpHqXVLWllTnynp8VL/mqTTWv1EIyKif43sGRwBPmV7NjAPWCFpdll3h+055bYJoKxbDFwIdAB3SxonaRxwF7AAmA1cW9Pn9tLrAuAwsKxFzy8iIhowaBjYPmD7R2X5H4FdwNQBNlkIbLD9pu3ngW5gbrl1295j+y1gA7BQkoDLgfvK9uuARcN8PhERMQxDOmcgaQbwPuDxUrpB0nZJayVNKrWpwN6azfaVWn/1c4FXbB/pU6/3+MsldUnq6unpGcrUIyJiAA2HgaQzgW8An7T9GnAP8C5gDnAA+PORmGAt26ttt9tub2trG+mHi4g4aYxvZJCkU6mC4Mu2vwlg+2DN+r8EHih39wPTazafVmr0U38ZmChpfNk7qB0fERGjoJGriQSsAXbZ/lxNfUrNsA8CT5flTmCxpAmSZgKzgCeArcCscuXQaVQnmTttG3gUuKZsvxS4v7mnFRERQ9HInsGlwEeAHZK2ldqnqa4GmgMYeAH4QwDbOyVtBJ6huhJphe2jAJJuADYD44C1tneWfjcCGyTdCjxFFT4RETFKBg0D2z8EVGfVpgG2uQ24rU59U73tbO+hutooIiLGQN6BHBERCYOIiEgYREQECYOIiCBhEBERJAwiIoKEQUREkDCIiAgSBhERQYMfVBcnhhkrvz2k8S+sunqEZhIRx5rsGURERMIgIiISBhERQcIgIiJIGEREBAmDiIggYRARETT2N5CnS3pU0jOSdkr6RKmfI2mLpN3l66RSl6Q7JXVL2i7poppeS8v43ZKW1tQvlrSjbHNn+bvLERExShrZMzgCfMr2bGAesELSbGAl8LDtWcDD5T7AAmBWuS0H7oEqPICbgUuo/sTlzb0BUsZ8rGa7juafWkRENGrQMLB9wPaPyvI/AruAqcBCYF0Ztg5YVJYXAutdeQyYKGkKcCWwxfYh24eBLUBHWXeW7cdsG1hf0ysiIkbBkM4ZSJoBvA94HJhs+0BZ9SIwuSxPBfbWbLav1Aaq76tTj4iIUdJwGEg6E/gG8Enbr9WuK6/o3eK51ZvDckldkrp6enpG+uEiIk4aDYWBpFOpguDLtr9ZygfLIR7K15dKfT8wvWbzaaU2UH1anfrb2F5tu912e1tbWyNTj4iIBjRyNZGANcAu25+rWdUJ9F4RtBS4v6a+pFxVNA94tRxO2gzMlzSpnDieD2wu616TNK881pKa
XhERMQoa+QjrS4GPADskbSu1TwOrgI2SlgE/BT5U1m0CrgK6gTeA6wBsH5L0WWBrGXeL7UNl+XrgXuB04MFyi4iIUTJoGNj+IdDfdf9X1BlvYEU/vdYCa+vUu4D3DDaXiIgYGXkHckREJAwiIiJhEBERJAwiIoKEQUREkDCIiAgSBhERQcIgIiJIGEREBAmDiIggYRARESQMIiKChEFERJAwiIgIEgYREUHCICIiSBhERAQJg4iIoIEwkLRW0kuSnq6pfUbSfknbyu2qmnU3SeqW9JykK2vqHaXWLWllTX2mpMdL/WuSTmvlE4yIiME1smdwL9BRp36H7TnltglA0mxgMXBh2eZuSeMkjQPuAhYAs4Fry1iA20uvC4DDwLJmnlBERAzdoGFg+wfAoQb7LQQ22H7T9vNANzC33Lpt77H9FrABWChJwOXAfWX7dcCioT2FiIho1vgmtr1B0hKgC/iU7cPAVOCxmjH7Sg1gb5/6JcC5wCu2j9QZ/zaSlgPLAc4///wmph4jYcbKbw9p/Aurrh6hmUTEUA33BPI9wLuAOcAB4M9bNaGB2F5tu912e1tb22g8ZETESWFYewa2D/YuS/pL4IFydz8wvWbotFKjn/rLwERJ48veQe34iIgYJcPaM5A0pebuB4HeK406gcWSJkiaCcwCngC2ArPKlUOnUZ1k7rRt4FHgmrL9UuD+4cwpIiKGb9A9A0lfBS4DzpO0D7gZuEzSHMDAC8AfAtjeKWkj8AxwBFhh+2jpcwOwGRgHrLW9szzEjcAGSbcCTwFrWvXkIiKiMYOGge1r65T7/YVt+zbgtjr1TcCmOvU9VFcbRUTEGMk7kCMiImEQEREJg4iIIGEQEREkDCIigoRBRESQMIiICBIGERFBwiAiImjuI6wjRlU+Ijti5GTPICIiEgYREZEwiIgIEgYREUHCICIiSBhERAQJg4iIoIEwkLRW0kuSnq6pnSNpi6Td5eukUpekOyV1S9ou6aKabZaW8bslLa2pXyxpR9nmTklq9ZOMiIiBNbJncC/Q0ae2EnjY9izg4XIfYAEwq9yWA/dAFR5Ufzv5Eqo/cXlzb4CUMR+r2a7vY0VExAgbNAxs/wA41Ke8EFhXltcBi2rq6115DJgoaQpwJbDF9iHbh4EtQEdZd5btx2wbWF/TKyIiRslwzxlMtn2gLL8ITC7LU4G9NeP2ldpA9X116hERMYqa/mwi25bkVkxmMJKWUx1+4vzzzx+Nh4yTSD77KE5mw90zOFgO8VC+vlTq+4HpNeOmldpA9Wl16nXZXm273XZ7W1vbMKceERF9DTcMOoHeK4KWAvfX1JeUq4rmAa+Ww0mbgfmSJpUTx/OBzWXda5LmlauIltT0ioiIUTLoYSJJXwUuA86TtI/qqqBVwEZJy4CfAh8qwzcBVwHdwBvAdQC2D0n6LLC1jLvFdu9J6euprlg6HXiw3CIiYhQNGga2r+1n1RV1xhpY0U+ftcDaOvUu4D2DzSMiIkZO3oEcERH5S2cRoyVXK8WxLHsGERGRMIiIiIRBRESQMIiICHICOeKEkRPU0YzsGURERMIgIiISBhERQcIgIiJIGEREBAmDiIggYRARESQMIiKChEFERJAwiIgI8nEUEdGgfNzFia2pPQNJL0jaIWmbpK5SO0fSFkm7y9dJpS5Jd0rqlrRd0kU1fZaW8bslLW3uKUVExFC14jDRb9meY7u93F8JPGx7FvBwuQ+wAJhVbsuBe6AKD+Bm4BJgLnBzb4BERMToGIlzBguBdWV5HbCopr7elceAiZKmAFcCW2wfsn0Y2AJ0jMC8IiKiH82GgYGHJD0paXmpTbZ9oCy/CEwuy1OBvTXb7iu1/upvI2m5pC5JXT09PU1OPSIiejV7Avk3bO+X9MvAFknP1q60bUlu8jFq+60GVgO0t7e3rG9EjL2coB5bTe0Z2N5fvr4EfIvqmP/BcviH8vWlMnw/ML1m82ml1l89IiJGybDDQNIvSXpn7zIw
H3ga6AR6rwhaCtxfljuBJeWqonnAq+Vw0mZgvqRJ5cTx/FKLiIhR0sxhosnAtyT19vmK7e9I2gpslLQM+CnwoTJ+E3AV0A28AVwHYPuQpM8CW8u4W2wfamJeERExRMMOA9t7gPfWqb8MXFGnbmBFP73WAmuHO5eIiGhOPo4iIiISBhERkTCIiAgSBhERQT61NCJOEnlT28CyZxAREQmDiIhIGEREBAmDiIggYRARESQMIiKCXFoaEdESx/ulq9kziIiIhEFERCQMIiKChEFERJATyBERx4WRPkGdPYOIiDh2wkBSh6TnJHVLWjnW84mIOJkcE2EgaRxwF7AAmA1cK2n22M4qIuLkcUyEATAX6La9x/ZbwAZg4RjPKSLipCHbYz0HJF0DdNj+aLn/EeAS2zf0GbccWF7u/jrw3BAe5jzgH1ow3bHofzzPPf3TP/2Prf6/arutb/G4uprI9mpg9XC2ldRlu73FUxqV/sfz3NM//dP/+Oh/rBwm2g9Mr7k/rdQiImIUHCthsBWYJWmmpNOAxUDnGM8pIuKkcUwcJrJ9RNINwGZgHLDW9s4WP8ywDi8dI/2P57mnf/qn/3HQ/5g4gRwREWPrWDlMFBERYyhhEBERCYOIiDhGTiC3mqR3U72DeWop7Qc6be8au1k1rsx/KvC47ddr6h22v9OC/nMB295aPvajA3jW9qZme/fzeOttLxmh3r9B9Q72p20/1IJ+lwC7bL8m6XRgJXAR8Azwn22/2mT/jwPfsr232bn207/3ary/t/1dSR8G/g2wC1ht+3+34DH+OfC7VJeDHwX+DviK7dea7R1j54Q7gSzpRuBaqo+02FfK06j+g2ywvWqEH/86219sYvuPAyuo/vPOAT5h+/6y7ke2L2pyfjdTfQbUeGALcAnwKPB+YLPt25rs3/eSYAG/BTwCYPt3muz/hO25ZfljVN+rbwHzgf/R7L+vpJ3Ae8sVbquBN4D7gCtK/Xeb7P8q8E/AT4CvAl+33dNMzz79v0z1b3sG8ApwJvBNqvnL9tIm+38c+ADwA+Aq4KnyOB8Errf9vWb6xxiyfULdqF6lnFqnfhqwexQe/2dNbr8DOLMszwC6qAIB4KkWzG8H1eW7ZwCvAWeV+unA9hb0/xHw34HLgN8sXw+U5d9sQf+napa3Am1l+ZeAHS3ov6v2ufRZt60V86c6PDsfWAP0AN8BlgLvbEH/7eXreOAgMK7cV4v+fXfU9DwD+F5ZPr9FP59nA6uAZ4FDwMtUL4xWAROb7T/IYz/Ygh5nAf8F+BLw4T7r7m5B/38G3EP1wZ7nAp8p/yYbgSnN9D4Rzxn8AviVOvUpZV3TJG3v57YDmNxk+1NcDg3ZfoHql+kCSZ+j+g/drCO2j9p+A/iJy6697Z/Tmu9PO/Ak8MfAq65eKf7c9vdtf78F/U+RNEnSuVSvdHsAbP8TcKQF/Z+WdF1Z/rGkdgBJvwY0fYiF6vDcL2w/ZHsZ1c/q3VSH6va0oP8p5VDRO6l+WZ9d6hOAU1vQH/7f4eUJVHse2P5Zi/pvBA4Dl9k+x/a5VHuWh8u6pki6qJ/bxVR74s36ItX/028AiyV9Q9KEsm5eC/rfS3XIci/VHv3PqfbQ/hr4b800PhHPGXwSeFjSbqpvGFSvWi4AbuhvoyGaDFxJ9QNaS8DfNtn7oKQ5trcB2H5d0geAtcC/bLI3wFuSzihhcHFvUdLZtCAMbP8CuEPS18vXg7T25+xsqrARYElTbB+QdCatCcuPAn8h6U+oPvzrf0raS/Wz9NEW9P//5ujqGH4n0CnpjBb0X0P1qnocVSB/XdIeql9EG1rQ/wvAVkmPA/8WuB1AUhvVK/lmzbB9e23B9ovA7ZL+Qwv6bwW+T/2flYkt6P8u279Xlv9K0h8Dj0hq6vBojcm2Pw8g6fqa79XnJS1rpvEJd84AQNIpVCcVa08gb7V9tEX91wBftP3DOuu+YvvDTfSeRvXq/cU66y61/TfD7V16TLD9Zp36
eVS7mTua6V+n79XApbY/3cq+dR7nDKr/KM+3qN9ZwEyqINtn+2CL+v6a7b9rRa8BHuNXAGz/vaSJwG9THb58okX9LwT+BdVJ+2db0bOm90PAd4F1vd9zSZOBPwDeb/u3m+z/NPBB27vrrNtre3qdzYbSfxdwYXlR1Fv7A+CPqA7//mqT/X9s+71l+Vbbf1KzboftYb9gPCHDICKOT5ImUV3BtRD45VI+SLX3tMp2373xofa/hurc0ts+/l7SItt/1WT/PwMesv3dPvUO4PO2ZzXZ/xbgz1xzlWGpX0D1/blm2L0TBhFxPGj2Sr30H2T7hEFEHA8k/cz2+ek/Mv1PxBPIEXGckrS9v1U0f6Ve+g8gYRARx5KRvFIv/QeQMIiIY8kDVFfdbOu7QtL30n/k+uecQUREnJDvQI6IiCFKGERERMIgIiISBhERQcIgIiKA/wPcKfrC8AnCYAAAAABJRU5ErkJggg==\n"
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "execution_count": 6
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-08-05T12:38:22.259142Z",
     "start_time": "2020-08-05T12:38:21.745661Z"
    }
   },
   "source": [
    "# 统计最多多少个字符"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:34:55.239056Z",
     "start_time": "2024-12-25T07:28:11.808602Z"
    }
   },
   "source": [
    "# Concatenate every text from both frames into one space-separated string,\n",
    "# then parse the whitespace-separated tokens as integers.\n",
    "corpus_parts = [' '.join(train_df['text']), ' '.join(test_df['text'])]\n",
    "all_lines = ' '.join(corpus_parts)\n",
    "all_words = np.asarray(all_lines.split()).astype(int)"
   ],
   "outputs": [],
   "execution_count": 8
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:39:53.307934Z",
     "start_time": "2024-12-25T07:39:53.148388Z"
    }
   },
   "source": [
    "# Largest integer token found across the train and test texts.\n",
    "np.max(all_words)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7549"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 9
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 统计每类新闻字符个数"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:53:34.504920Z",
     "start_time": "2024-12-25T06:52:28.135020Z"
    }
   },
   "source": [
    "# Turn each space-separated text string into an integer numpy array of its tokens.\n",
    "# NOTE: the 'text' column is overwritten in place, so this cell cannot be re-run\n",
    "# without reloading the CSV files first.\n",
    "for frame in (train_df, test_df):\n",
    "    frame['text'] = frame['text'].apply(lambda doc: np.asarray(doc.split(' ')).astype(int))"
   ],
   "outputs": [],
   "execution_count": 9
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:53:45.312798Z",
     "start_time": "2024-12-25T06:53:43.969123Z"
    }
   },
   "source": [
    "# Matrix with one row per label (14 rows) and one column per token id (7550 columns).\n",
    "all_word_vec = np.zeros((14, 7550), dtype=np.float32)\n",
    "\n",
    "# Fancy-index `+=` bumps each distinct token id at most once per document,\n",
    "# so an entry counts the documents of that label which contain the token.\n",
    "for label, tokens in zip(train_df['label'], train_df['text']):\n",
    "    all_word_vec[label, tokens] += 1\n",
    "\n",
    "# Scale by the total number of training rows (all labels combined).\n",
    "all_word_vec /= len(train_df)"
   ],
   "outputs": [],
   "execution_count": 10
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:53:47.670291Z",
     "start_time": "2024-12-25T06:53:47.662755Z"
    }
   },
   "source": [
    "# For every label row, print the ten token ids with the largest values (ascending order).\n",
    "for idx, label_vec in enumerate(all_word_vec):\n",
    "    print(idx, np.argsort(label_vec)[-10:])"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 [4811 3659 4464 7399 3370 6122 2465  648  900 3750]\n",
      "1 [6065  299 4853 2465 3659  648 4464 3370 3750  900]\n",
      "2 [4939 5598 5445 4704 6122 2465 7399  648  900 3750]\n",
      "3 [ 669 4811 4853 4893 6122 7399 2465  648  900 3750]\n",
      "4 [2400 4411 6122 4464 2073 7399  648 4853 3750  900]\n",
      "5 [ 299 5977 4659 5598 4893 7399 6122  648 3750  900]\n",
      "6 [6122 2555 6248 1519 1699 4811 5620  648  900 3750]\n",
      "7 [ 299 2400 4811 3659 4464 1699 3370  648 3750  900]\n",
      "8 [5598  669  913 4811 4939 7399 6122  900 3750  648]\n",
      "9 [2109 5598 4939 4811 7328 6122 7399  900  648 3750]\n",
      "10 [ 299 4464 2465 3370 1699 3686  648  885 3750  900]\n",
      "11 [4893 7399 4811  669 4939 7539 6122  900  648 3750]\n",
      "12 [6065 5602 2400 2465 3659 7539  900 3750 4464 3370]\n",
      "13 [3864 3870 4939  900 1580 7539 5491 2662 3750  648]\n"
     ]
    }
   ],
   "execution_count": 11
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:53:49.926094Z",
     "start_time": "2024-12-25T06:53:49.920357Z"
    }
   },
   "source": [
    "# Positions at which token 3750 occurs in the first training document.\n",
    "np.nonzero(train_df['text'].iloc[0] == 3750)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([  35,   48,   64,   74,  106,  115,  153,  168,  178,  185,  216,\n",
       "         244,  251,  264,  286,  291,  302,  331,  347,  361,  387,  418,\n",
       "         431,  441,  466,  505,  512,  521,  532,  548,  564,  577,  586,\n",
       "         613,  616,  622,  634,  679,  697,  721,  745,  755,  761,  782,\n",
       "         793,  803,  828,  836,  842,  856,  861,  884,  897,  924,  928,\n",
       "         939,  949,  963,  968,  980, 1011, 1029, 1044, 1051]),)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 12
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:53:51.294549Z",
     "start_time": "2024-12-25T06:53:51.290565Z"
    }
   },
   "source": [
    "# Positions at which token 900 occurs in the first training document.\n",
    "np.nonzero(train_df['text'].iloc[0] == 900)"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([  61,   96,  141,  198,  237,  258,  279,  313,  378,  403,  457,\n",
       "         481,  490,  557,  597,  631,  647,  667,  703,  713,  775,  809,\n",
       "         849,  869,  891,  910,  976, 1018, 1047, 1055]),)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 13
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:53:52.446715Z",
     "start_time": "2024-12-25T06:53:52.442287Z"
    }
   },
   "source": [
    "# Ten token ids with the highest totals once the label rows are added together.\n",
    "np.argsort(all_word_vec.sum(axis=0))[-10:]"
   ],
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3659, 1699, 4464, 4811, 7399, 6122, 2465,  648,  900, 3750])"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 14
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# baseline加强版"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:54:00.663829Z",
     "start_time": "2024-12-25T06:53:54.043556Z"
    }
   },
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split, KFold, StratifiedKFold\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.metrics import f1_score\n",
    "\n",
    "# Fresh read of the tab-separated train/test files (nrows=None reads every row).\n",
    "train_df, test_df = (\n",
    "    pd.read_csv(csv_path, sep='\\t', nrows=None)\n",
    "    for csv_path in ('../input/train_set.csv', '../input/test_a.csv')\n",
    ")"
   ],
   "outputs": [],
   "execution_count": 15
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:12:11.083163Z",
     "start_time": "2024-12-25T06:54:04.491812Z"
    }
   },
   "source": [
    "# TF-IDF over 1- to 3-grams, vocabulary capped at 10000 features, fit on the training text only.\n",
    "tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=10000).fit(train_df['text'].iloc[:].values)\n",
    "train_tfidf = tfidf.transform(train_df['text'].iloc[:].values)\n",
    "test_tfidf = tfidf.transform(test_df['text'].iloc[:].values)\n",
    "\n",
    "# shuffle=True is required for random_state to take effect; scikit-learn raises\n",
    "# ValueError when random_state is set while shuffle is left at its default False.\n",
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)\n",
    "# Running sum of the per-fold class probabilities for the test set (14 columns, one per label).\n",
    "test_pred = np.zeros((test_tfidf.shape[0], 14), dtype=np.float32)\n",
    "for idx, (train_index, valid_index) in enumerate(skf.split(train_tfidf, train_df['label'].values)):\n",
    "\n",
    "    x_train_, x_valid_ = train_tfidf[train_index], train_tfidf[valid_index]\n",
    "    y_train_, y_valid_ = train_df['label'].values[train_index], train_df['label'].values[valid_index]\n",
    "\n",
    "    clf = LGBMClassifier()\n",
    "    clf.fit(x_train_, y_train_)\n",
    "    val_pred = clf.predict(x_valid_)\n",
    "\n",
    "    # Macro-F1 on the held-out fold.\n",
    "    print(f1_score(y_valid_, val_pred, average='macro'))\n",
    "    # Probabilities are summed, not averaged; the later argmax is unaffected by the scale.\n",
    "    test_pred += clf.predict_proba(test_tfidf)"
   ],
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mValueError\u001B[0m                                Traceback (most recent call last)",
      "Cell \u001B[1;32mIn[16], line 5\u001B[0m\n\u001B[0;32m      2\u001B[0m train_tfidf \u001B[38;5;241m=\u001B[39m tfidf\u001B[38;5;241m.\u001B[39mtransform(train_df[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39miloc[:]\u001B[38;5;241m.\u001B[39mvalues)\n\u001B[0;32m      3\u001B[0m test_tfidf \u001B[38;5;241m=\u001B[39m tfidf\u001B[38;5;241m.\u001B[39mtransform(test_df[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtext\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39miloc[:]\u001B[38;5;241m.\u001B[39mvalues)\n\u001B[1;32m----> 5\u001B[0m skf \u001B[38;5;241m=\u001B[39m \u001B[43mStratifiedKFold\u001B[49m\u001B[43m(\u001B[49m\u001B[43mn_splits\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m5\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mrandom_state\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m7\u001B[39;49m\u001B[43m)\u001B[49m \n\u001B[0;32m      6\u001B[0m test_pred \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39mzeros((test_tfidf\u001B[38;5;241m.\u001B[39mshape[\u001B[38;5;241m0\u001B[39m], \u001B[38;5;241m14\u001B[39m), dtype\u001B[38;5;241m=\u001B[39mnp\u001B[38;5;241m.\u001B[39mfloat32)  \n\u001B[0;32m      7\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m idx, (train_index, valid_index) \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28menumerate\u001B[39m(skf\u001B[38;5;241m.\u001B[39msplit(train_tfidf, train_df[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mlabel\u001B[39m\u001B[38;5;124m'\u001B[39m]\u001B[38;5;241m.\u001B[39mvalues)):\n",
      "File \u001B[1;32mD:\\python2\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:744\u001B[0m, in \u001B[0;36mStratifiedKFold.__init__\u001B[1;34m(self, n_splits, shuffle, random_state)\u001B[0m\n\u001B[0;32m    743\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, n_splits\u001B[38;5;241m=\u001B[39m\u001B[38;5;241m5\u001B[39m, \u001B[38;5;241m*\u001B[39m, shuffle\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mFalse\u001B[39;00m, random_state\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m):\n\u001B[1;32m--> 744\u001B[0m     \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43mn_splits\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mn_splits\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mshuffle\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mshuffle\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mrandom_state\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mrandom_state\u001B[49m\u001B[43m)\u001B[49m\n",
      "File \u001B[1;32mD:\\python2\\Lib\\site-packages\\sklearn\\model_selection\\_split.py:370\u001B[0m, in \u001B[0;36m_BaseKFold.__init__\u001B[1;34m(self, n_splits, shuffle, random_state)\u001B[0m\n\u001B[0;32m    367\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mTypeError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mshuffle must be True or False; got \u001B[39m\u001B[38;5;132;01m{0}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;241m.\u001B[39mformat(shuffle))\n\u001B[0;32m    369\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m shuffle \u001B[38;5;129;01mand\u001B[39;00m random_state \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:  \u001B[38;5;66;03m# None is the default\u001B[39;00m\n\u001B[1;32m--> 370\u001B[0m     \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\n\u001B[0;32m    371\u001B[0m         (\n\u001B[0;32m    372\u001B[0m             \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mSetting a random_state has no effect since shuffle is \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    373\u001B[0m             \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mFalse. You should leave \u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    374\u001B[0m             \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mrandom_state to its default (None), or set shuffle=True.\u001B[39m\u001B[38;5;124m\"\u001B[39m\n\u001B[0;32m    375\u001B[0m         ),\n\u001B[0;32m    376\u001B[0m     )\n\u001B[0;32m    378\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mn_splits \u001B[38;5;241m=\u001B[39m n_splits\n\u001B[0;32m    379\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mshuffle \u001B[38;5;241m=\u001B[39m shuffle\n",
      "\u001B[1;31mValueError\u001B[0m: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True."
     ]
    }
   ],
   "execution_count": 16
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:52:07.963930600Z",
     "start_time": "2020-08-06T01:32:03.666578Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2.43331306e-03, 4.98582077e+00, 8.70002899e-04, ...,\n",
       "        1.79282186e-04, 3.83838706e-05, 1.42652807e-05],\n",
       "       [9.62744525e-04, 5.30660851e-04, 4.99282217e+00, ...,\n",
       "        1.09481793e-04, 5.39226567e-05, 3.21293896e-06],\n",
       "       [6.14396529e-03, 1.20122253e-03, 4.03519371e-04, ...,\n",
       "        2.68263917e-04, 1.42693880e-05, 5.24245479e-06],\n",
       "       ...,\n",
       "       [1.52684171e-02, 4.40746832e+00, 5.02393302e-03, ...,\n",
       "        1.00517226e-03, 2.04726894e-04, 7.74227083e-05],\n",
       "       [1.22115649e-02, 1.91734277e-03, 1.79137068e-03, ...,\n",
       "        1.72756307e-04, 1.54060166e-04, 1.05225445e-05],\n",
       "       [1.18725467e-02, 4.96777058e+00, 7.75435183e-04, ...,\n",
       "        1.73424472e-04, 2.99724416e-05, 1.18266225e-05]], dtype=float32)"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Class probabilities summed over the cross-validation folds, one row per test document.\n",
    "test_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T06:52:07.967926800Z",
     "start_time": "2020-08-06T01:32:24.252930Z"
    },
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Predicted label = column with the highest accumulated score; saved to submit.csv.\n",
    "df = pd.DataFrame({'label': test_pred.argmax(axis=1)})\n",
    "df.to_csv('submit.csv', index=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TFIDF"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\n",
    "- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html\n",
    "- https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:40:04.296456Z",
     "start_time": "2024-12-25T07:40:04.245236Z"
    }
   },
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "# Four toy sentences used to illustrate bag-of-words counting.\n",
    "corpus = [\n",
    "    'This is the first document.',\n",
    "    'This document is the second document.',\n",
    "    'And this is the third one.',\n",
    "    'Is this the first document?',\n",
    "]\n",
    "\n",
    "# Default settings: one column per vocabulary word.\n",
    "vectorizer = CountVectorizer()\n",
    "X = vectorizer.fit_transform(corpus)\n",
    "# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()\n",
    "# returns an ndarray, so .tolist() keeps the printed output a plain list.\n",
    "print(vectorizer.get_feature_names_out().tolist())\n",
    "print(X.toarray())\n",
    "\n",
    "# Same corpus with word unigrams and bigrams as features.\n",
    "vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(1, 2))\n",
    "X2 = vectorizer2.fit_transform(corpus)\n",
    "print(vectorizer2.get_feature_names_out().tolist())\n",
    "print(X2.toarray())"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\n",
      "[[0 1 1 1 0 0 1 0 1]\n",
      " [0 2 0 1 0 1 1 0 1]\n",
      " [1 0 0 1 1 0 1 1 1]\n",
      " [0 1 1 1 0 0 1 0 1]]\n",
      "['and', 'and this', 'document', 'document is', 'first', 'first document', 'is', 'is the', 'is this', 'one', 'second', 'second document', 'the', 'the first', 'the second', 'the third', 'third', 'third one', 'this', 'this document', 'this is', 'this the']\n",
      "[[0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0]\n",
      " [0 0 2 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0]\n",
      " [1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0]\n",
      " [0 0 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1]]\n"
     ]
    }
   ],
   "execution_count": 10
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:40:06.798088Z",
     "start_time": "2024-12-25T07:40:06.736454Z"
    }
   },
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Same four toy sentences, this time weighted with TF-IDF.\n",
    "corpus = [\n",
    "    'This is the first document.',\n",
    "    'This document is the second document.',\n",
    "    'And this is the third one.',\n",
    "    'Is this the first document?',\n",
    "]\n",
    "vectorizer = TfidfVectorizer()\n",
    "X = vectorizer.fit_transform(corpus)\n",
    "# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()\n",
    "# returns an ndarray, so .tolist() keeps the printed output a plain list.\n",
    "print(vectorizer.get_feature_names_out().tolist())\n",
    "\n",
    "# (number of documents, vocabulary size)\n",
    "print(X.shape)"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']\n",
      "(4, 9)\n"
     ]
    }
   ],
   "execution_count": 11
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# FastText"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {
    "scrolled": true,
    "ExecuteTime": {
     "end_time": "2024-12-25T07:40:28.690563Z",
     "start_time": "2024-12-25T07:40:09.030145Z"
    }
   },
   "source": [
    "# Reload the raw training data and build fastText-style labels ('__label__' + label value).\n",
    "train_df = pd.read_csv('../input/train_set.csv', sep='\\t', nrows=None)\n",
    "label_text = train_df['label'].astype(str)\n",
    "train_df['label_ft'] = '__label__' + label_text\n",
    "\n",
    "# Everything except the last 5000 rows goes into the training file:\n",
    "# tab-separated, no header row, text first and label second.\n",
    "fasttext_train = train_df[['text', 'label_ft']].iloc[:-5000]\n",
    "fasttext_train.to_csv('train.csv', index=None, header=None, sep='\\t')"
   ],
   "outputs": [],
   "execution_count": 12
  },
  {
   "cell_type": "code",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-25T07:56:40.408395Z",
     "start_time": "2024-12-25T07:40:37.072642Z"
    }
   },
   "source": [
    "import fasttext\n",
    "\n",
    "# Supervised fastText classifier trained on the file written by the previous cell.\n",
    "model = fasttext.train_supervised(\n",
    "    'train.csv',\n",
    "    lr=1.0,\n",
    "    wordNgrams=5,\n",
    "    verbose=2,\n",
    "    minCount=1,\n",
    "    epoch=25,\n",
    "    loss=\"hs\",\n",
    ")\n",
    "\n",
    "# Score the last 5000 rows: keep the first predicted label of each text and strip\n",
    "# everything up to the final '__' so it can be compared with the label as a string.\n",
    "holdout = train_df.iloc[-5000:]\n",
    "val_pred = []\n",
    "for doc in holdout['text']:\n",
    "    top_labels = model.predict(doc)[0]\n",
    "    val_pred.append(top_labels[0].split('__')[-1])\n",
    "print(f1_score(holdout['label'].values.astype(str), val_pred, average='macro'))"
   ],
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9152394194028809\n"
     ]
    }
   ],
   "execution_count": 13
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
